Abstract
A continuación, se presenta un análisis mediante agrupación por clústers de los FRE de acuerdo a variables de desempeño y complejidad de los mismos
require(readxl)
require(skimr)
require(PerformanceAnalytics)
require(factoextra)
require(patchwork)
require(tidyverse); theme_set(theme_bw())
source(file.path('src', 'models', '900_funcionesAlmacenamientoGrafico.R'), encoding = 'UTF-8')
fig_path <- file.path('figures', '011_clasificacion')
data <-
read_excel(file.path(
'data',
'raw',
'ClasificacionFRE',
'variablesClasificacionFRE.xlsx'
), na = '-') %>%
mutate(Departamento...2 = str_to_title(Departamento...2))
## New names:
## * Departamento -> Departamento...1
## * Departamento -> Departamento...2
data %>%
select(!contains('Departamento')) %>%
chart.Correlation(., histogram = TRUE, pch = 19)
Correlación entre variables de PCA
if (knitr::is_html_output()) {
skimr::skim(data)
}
| Name | data |
| Number of rows | 33 |
| Number of columns | 9 |
| _______________________ | |
| Column type frequency: | |
| character | 1 |
| numeric | 8 |
| ________________________ | |
| Group variables | None |
Variable type: character
| skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
|---|---|---|---|---|---|---|---|
| Departamento…2 | 0 | 1 | 4 | 40 | 0 | 33 | 0 |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
|---|---|---|---|---|---|---|---|---|---|---|
| Departamento…1 | 0 | 1.00 | 1.700000e+01 | 9.670000e+00 | 1.000000e+00 | 9.000000e+00 | 1.700000e+01 | 2.500000e+01 | 3.300000e+01 | ▇▇▇▇▇ |
| Presupuesto_2020 | 0 | 1.00 | 9.214848e+11 | 8.500217e+11 | 1.020000e+11 | 3.770000e+11 | 8.160000e+11 | 1.239000e+12 | 4.383000e+12 | ▇▃▁▁▁ |
| Presupuesto_Salud_2020 | 1 | 0.97 | 1.092351e+11 | 1.778047e+11 | 1.021624e+09 | 3.379498e+10 | 5.854786e+10 | 1.053443e+11 | 9.783884e+11 | ▇▁▁▁▁ |
| N_productosCD_2021 | 0 | 1.00 | 3.721000e+01 | 6.164000e+01 | 0.000000e+00 | 0.000000e+00 | 5.000000e+00 | 3.300000e+01 | 2.080000e+02 | ▇▁▁▁▁ |
| Cumplimiento_A1_2020-2021-06 | 0 | 1.00 | 9.800000e-01 | 7.000000e-02 | 7.500000e-01 | 1.000000e+00 | 1.000000e+00 | 1.000000e+00 | 1.000000e+00 | ▁▁▁▁▇ |
| Cumplimiento_A2_2020-2021-06 | 2 | 0.94 | 5.200000e-01 | 3.900000e-01 | 0.000000e+00 | 1.100000e-01 | 6.100000e-01 | 8.600000e-01 | 1.000000e+00 | ▇▂▁▅▇ |
| PropPortafolio | 0 | 1.00 | 8.400000e-01 | 1.900000e-01 | 3.500000e-01 | 8.200000e-01 | 9.400000e-01 | 1.000000e+00 | 1.000000e+00 | ▁▁▁▁▇ |
| No_Inscritos | 2 | 0.94 | 1.593200e+02 | 2.144000e+02 | 1.000000e+00 | 2.650000e+01 | 7.800000e+01 | 2.220000e+02 | 1.054000e+03 | ▇▂▁▁▁ |
pca1 <- data %>%
drop_na() %>%
column_to_rownames('Departamento...2') %>%
select(!contains('Departamento')) %>%
prcomp(., scale = TRUE, center = TRUE)
plot(pca1, main = 'Distribución de varianza')
Distribución de las varianzas
gg1 <- fviz_pca_ind(pca1,
col.ind = "cos2",
gradient.cols = c("#00AFBB", "#E7B800", "#FC4E07"),
repel = TRUE
) + labs(title = 'PCA de individuos')
gg1
Individuos representados en PCA
guardarGGplot(gg1, '001_fig_ind', 8, 6, fig_path)
gg2 <- fviz_pca_var(
pca1,
col.var = "contrib",
gradient.cols = c("#00AFBB", "#E7B800", "#FC4E07"),
repel = TRUE
) + labs(title = 'PCA - Explicación de variables')
gg2
Explicación de variables en PCA
guardarGGplot(gg2, '002_fig_var', 8, 6, fig_path)
clean_data <- data %>%
drop_na() %>%
column_to_rownames('Departamento...2') %>%
select(!contains('Departamento'))
funClusters <- function(data, k) {
data %>%
kmeans(k)
}
k.values <- data.frame(k = 1:30)
k.values['TWITH'] <- map_dbl(k.values$k, ~funClusters(clean_data, .x)$tot.withinss)
gg3 <- k.values %>%
ggplot(aes(x = k, y = TWITH)) +
geom_point() + geom_line() +
geom_vline(xintercept = 4, lty = 'dashed', col = 'blue3') +
ylab('Suma de cuadrados \n dentro de clústers') +
xlab('N.° de Clusters, k')
gg3
Criterio de codo para clústers por Kmeans
guardarGGplot(gg3, '003_elbowlKmeans', 6, 4, fig_path)
g1 <- funClusters(clean_data, 4) %>%
fviz_cluster(., data = clean_data) +
theme_bw() + labs(title = NULL)
g1
Clústers Kmeans visualizados en primeros dos PC
guardarGGplot(g1, '004_clusterKmeans', 8, 5, fig_path)
g2 <- funClusters(clean_data, 4) %>%
fviz_cluster(., data = clean_data, axes = c(1, 2)) +
theme_bw() + labs(title = NULL)
g3 <- funClusters(clean_data, 4) %>%
fviz_cluster(., data = clean_data, axes = c(1, 3)) +
theme_bw() + labs(title = NULL)
g4 <- funClusters(clean_data, 4) %>%
fviz_cluster(., data = clean_data, axes = c(2, 3)) +
theme_bw() + labs(title = NULL)
g5 <- funClusters(clean_data, 4) %>%
fviz_cluster(., data = clean_data, axes = c(1, 4)) +
theme_bw() + labs(title = NULL)
ggt <- wrap_plots(g2, g3, g4, g5)
ggt
Clústers Kmeans visualizados en varios componentes
guardarGGplot(ggt, '005_cluz_group', 12, 8, fig_path)
funClusters_2 <- function(data, k) {
t1 <- data %>%
dist(method = 'euclidean') %>%
hclust(method = 'complete')
t2 <- cutree(t1, k)
return(list(clust = t1, tree = t2))
}
p1 <- plot(funClusters_2(clean_data, 3)$clust, cex = 0.6, hang = -1, ylab = 'Altura',
main = 'Dendrograma de clúster', xlab = NULL)
Dendrograma de análisis por clústers
pdf(file.path(fig_path, '010_dendrograma.pdf'), width = 12, height = 8)
plot(funClusters_2(clean_data, 3)$clust, cex = 0.6, hang = -1, ylab = 'Altura',
main = 'Dendrograma de clúster', xlab = NULL)
dev.off()
## png
## 2
saveRDS(p1, file.path(fig_path, '010_dendrograma.rds'))
gg1 <- funClusters_2(clean_data, 3)$clust$height %>%
as.tibble() %>%
add_column(groups = length(funClusters_2(clean_data, 3)$clust$height):1) %>%
rename(height = value) %>%
ggplot(aes(x=groups, y = height)) +
geom_point() + geom_line() +
geom_vline(xintercept = 5, lty = 'dashed', col = 'blue3') +
ylab('Altura') +
xlab('N.° de Clusters, k')
## Warning: `as.tibble()` was deprecated in tibble 2.0.0.
## Please use `as_tibble()` instead.
## The signature and semantics have changed, see `?as_tibble`.
gg1
Criterio de codo para clústers jerárquicos
guardarGGplot(gg1, '012_elbowlWard', 6, 4, fig_path)
gg2 <- clean_data %>%
{fviz_cluster(list(data = ., cluster = funClusters_2(., 5)$tree))} +
theme_bw() + labs(title = NULL) +
scale_color_discrete(name = 'Clúster') +
scale_shape_discrete(name = 'Clúster') +
scale_fill_discrete(name = 'Clúster')
gg2
Clústers Jerárquicos visualizados en primeros dos componentes
guardarGGplot(gg2, '013_cluz_group', 8, 5, fig_path)
funClusters_3 <- function(axes) {
clean_data %>%
{fviz_cluster(list(data = ., cluster = funClusters_2(., 5)$tree),
axes = axes, ellipse = T)} +
theme_bw() + labs(title = NULL) +
scale_color_discrete(name = 'Clúster') +
scale_shape_discrete(name = 'Clúster') +
scale_fill_discrete(name = 'Clúster')
}
g2 <- funClusters_3(c(1,2))
g3 <- funClusters_3(c(1,3))
g4 <- funClusters_3(c(2,3))
g5 <- funClusters_3(c(1,4))
ggt <- wrap_plots(g2, g3, g4, g5)
ggt
Clústers jerarquizados visualizados en varios componentes
guardarGGplot(ggt, '014_cluz_group2', 12, 10, fig_path)
require(plotly)
trans_data <- as_tibble(pca1$x, rownames = 'Departamentos') %>%
left_join(funClusters_2(clean_data, 5)$tree %>%
as_tibble(rownames = 'Departamentos'), by = 'Departamentos') %>%
rename(Grupo_hclus = value) %>%
mutate(Grupo_hclus = as.integer(Grupo_hclus))
colors <- c('#4AC6B7', '#1972A4', '#965F8A', '#FF7070', '#C61951')
fig <- trans_data %>%
plot_ly(x = ~PC1, y = ~PC2, z = ~PC3, split = ~Grupo_hclus,
colors = colors, name = ~Grupo_hclus, text = ~Departamentos,
hovertemplate = "%{text}<br>PC1: %{x}<br>PC2: %{y}<br>PC3: %{z}")
if (knitr::is_html_output()) {
fig %>%
add_markers()
}
PCA3d
trans_data1 <- data %>%
left_join(funClusters_2(clean_data, 5)$tree %>%
as_tibble(rownames = 'Departamentos'),
by = c('Departamento...2' = 'Departamentos')) %>%
rename(Grupo_hclus = value, Departamentos = Departamento...2) %>%
mutate(Grupo_hclus = as.integer(Grupo_hclus))
fig1 <- trans_data1 %>%
plot_ly(x = ~No_Inscritos, y = ~`Cumplimiento_A2_2020-2021-06`,
z = ~PropPortafolio, split = ~Grupo_hclus,
colors = colors, name = ~Grupo_hclus, text = ~Departamentos,
hovertemplate = "%{text}<br>N.° inscritos: %{x}<br>Cumplimiento A2: %{y}<br>Prop. portafolio: %{z}")
if (knitr::is_html_output()) {
fig1 %>%
add_markers()
}
## Warning: Ignoring 2 observations
PCA3d-1